
/*************************************************
Notes: 
This program pulls in the estab data and aggregates to the firm level
It adds in firm imports
It creates firm types
It assesses different share variables
It makes a firm dataset with firm type for use in limiting imports as appropriate for calculating facts
It makes a table of the firm types
**************************************************/

//Set directories
cd ""  /* PROJECT ROOT FOLDER */

//DEFINE GLOBALS PATHS HERE 
global data "data"
global junk "data/junk"
global lbd_data "path to LBD_c201600 version here"


set more off


cap program drop clean_census
program define clean_census
  syntax , year(integer) naicscode(str)

  use "$data/estabs_sales`year'.dta", clear
	rename *, lower	
	
  * make manuf sales
  gen sales_man = sales if substr(`naicscode',1,1) == "3"

    //Pick the best firmid 
      gen count=1
      tab count if firmid==""
      tab count if firmid_lbd==""
      tab count if firmid~=firmid_lbd
      tab count if firmid_census==""
      tab count if firmid_census~=firmid_lbd & firmid_census~="" & firmid_lbd~=""

  drop if missing(firmid) // analysis is at firm-level, these guys are useless, lbd-only sources


  tab source

  label variable emp "Emp from Census, or LBD if census missing"


  // Make plant-level productivity measures
  // **************************************
  gen vap=(va*1000)/emp
  replace vap=(gross_margin*1000)/emp if vap==.
  gen lvap=ln(vap)
  bysort `naicscode': egen emp_ind=sum(emp)
  gen wt_ind=emp/emp_ind
  gen lvap_t=wt_ind*lvap
  bysort `naicscode': egen lvap_ind=sum(lvap_t)
  gen lvap_r=lvap-lvap_ind

  //Make employment weights
  bysort firmid: egen emp_tot=sum(emp)
  gen wt=emp/emp_tot
  gen lvap_r_t=lvap_r*wt
  drop emp_tot emp_ind

  //Make sectoral employment totals for shares
  //Use naics codes (necessary for LBD data)
  gen n1=substr(`naicscode',1,1)
  gen n2=substr(`naicscode',1,2)
  gen emp_man=emp if n1=="3"
  gen emp_whole=emp if n2=="42"
  gen emp_retail=emp if n2=="44" | n2=="45"
  gen emp_services=emp if n1=="5" | n1=="6" | n1=="7" | n1=="8"
  gen emp_public=emp if n1=="9"
  gen emp_util=emp if n2=="22"
  gen emp_mining=emp if n2=="21"
  gen emp_agric=emp if n2=="11"
  gen emp_const=emp if n2=="23"
  gen emp_twh=emp if n2=="48" | n2=="49"
  gen emp_manage=emp if n2=="55"
  gen emp_prof=emp if n2=="54"
  gen estabs=1
  gen estabs_cmf=1 if source=="cmf"
  gen estabs_whole=1 if source=="cwh"
  gen estabs_manage=1 if n2=="55"
  destring optype, gen(optype_num)
  gen emp_broker=emp if optype_num>40 & optype_num<50

  //Make plant counts
  gen man_estabs=1 if n1=="3"

  //Make mining VA variable
  gen va_mining=va if n2=="21"
  /*
  if `year'==2002 {
	  replace va_mining = valaddm if source=="cmi"
	  }
  tab source if va~=.  
  */
  if `year'>=2002 {  //JD query: Is this the intended behavior? todo
	  replace va=valaddc if source=="ccn" 
  	}
  /*
  if `year'==2002 {
	  replace va  = valaddc if source=="ccn"
	  replace va = valaddm if source=="cmi"
	  }
  */
  //Make input variables
  gen input_whole=merch
  gen sales_other = sales if source~="cmf" & source~="cmi" & source~="ccn" 

  //Make inventory adjusted share measures
  gen del_inv= (fie+wie)-(fib+wib)
  gen del_miv= mie-mib

  gen input_man_va=sales-va //CMF, CCN, CMI

  //Create variable materials inputs
  /*
  if `year'==1997 {
    gen input_mats=cm if source=="cmf" | source=="cmi" | source=="ccn" 
    }
  if `year'==2002 {
	  gen input_mats = cm if source=="cmf" | source=="cmi" | source=="ccn" 
	  }
	  if `year'==2007 {
  */	

    gen input_mats=cm if source=="cmf" | source=="cmi" 
    replace input_mats=cp if  source=="ccn" 
    *}
      
  //Make total input variables
  egen tot_inputs_va=rowtotal(input_man_va input_whole ww) 
  egen tot_inputs=rowtotal(input_mats input_whole ww)
  gen tot_inputs_manuf=cm+ww if source=="cmf"
  gen tot_inputs_mw=cm+ww if  source=="cmf"
  replace tot_inputs_mw=merch if source=="cwh"
  egen tot_inputs_max =rowtotal(input_mats sales_other ww)
  
  //Make variable for firm manuf sales (including ipt)
    foreach var in cmf cwh {
      gen sales_`var'=sales if source=="`var'"
      }  
      gen sales_net=sales-ipt
      label variable sales_net "Sales-ipt"
    

  // ***LIMITATIONS TO SAMPLE FOR ALL SUBSEQUENT ANALYSIS****
  foreach var in sales emp {
    bysort firmid: egen firm_tot_`var'=sum(`var')
    }
    
  drop if firm_tot_sales<=0 & firm_tot_emp<=0   
  * note we want to keep plants that may have 0 emp but positive sales, because things may be happening between plants within a firm
  * at the firm level, we do want to drop those that have EITHER sales<=0 or emp<=0

  *Drop the Administrative Records to avoid using imputed data
  *changed this because the are important for importing firm counts 
  *drop if ar==1 

  *identify firm's main N4 industry
  tempfile tf_1
  preserve
  gen naics4=substr(`naicscode',1,4)
    replace naics4=substr(naics07,1,4) if naics4=="" & `year'==2007
    collapse (sum) sales , by(firmid naics4)
    sort firmid sales
    drop if sales==.
    bys firmid: gen keep=1 if _n==_N
    keep if keep==1
    drop keep 
    rename sales n4_main_sales
    rename naics4 naics4_primary
    save `tf_1'
  restore  
  
  
	* for lorenzo caliendo - number of unique NAICS 6 in manuf
	preserve
	keep if substr(`naicscode', 1,1) == "3"
	duplicates drop firmid `naicscode', force
	gen unique_manuf_ind = 1
	collapse(sum) unique_manuf_ind, by(firmid)

	save $junk/unique_naics`year'.dta, replace
	restore

  

  * collapse by firmid
  if `year' >1997 {
    replace ar=0 if ar==.
  }
  else {
    gen ar = .
  }
  gen inputs_ar = tot_inputs if ar==1
  gen inputs_not_ar = tot_inputs if ar==0

  * make ar variables for emp and sales
  gen ar_sales=sales if ar==1
  gen ar_emp=emp if ar==1

  collapse (sum) lvap_r=lvap_r_t emp* sales sales_net estabs* ar_estabs=ar ar_sales ar_emp tot_input* inputs_ar inputs_not_ar ipt sales_cmf sales_cwh cm ww pay_lbd sales_man, by(firmid)
  
    duplicates tag firmid, gen(dupes)
    tab dupes
	  
  * Add in the main N4 industry of the firm
    merge 1:1 firmid using `tf_1'
    drop _merge
    
    merge 1:1 firmid using $junk/unique_naics`year'
    drop _merge

	  /*  Label variables  */
	  label variable sales "Total firm sales across all establishments"
	  label variable sales_net "Total firm sales across all establishments, net of ipt"
	  label variable tot_inputs_va "Sales-va, merch, ww"
	  label variable tot_inputs "Materials, merch, ww"
	  label variable tot_inputs_manuf "Manuf materials & ww"
	  label variable tot_inputs_max "CCN CMF CMI cost of materials + CMF ww + sales in other sectors"
	  label variable naics4_primary "Main NAICS 4 of firm based on sales"
	  label variable sales_man "Sales of manuf estabs, by fk_naics12"
	  
	  *Restrict sample to firms with positive sales and employment 
	  drop if sales<=0 | emp<=0

	  
	  capture drop empavg
	  
  *Save firm-level dsataset with sales and inputs.
    save $data/firm_`year'_temp.dta, replace	



  * also get firm age
  import sas using "$lbd_data/firm_`year'_emp_c201600.sas7bdat", clear
  keep firmid firmage
  merge 1:1 firmid using $data/firm_`year'_temp.dta
  drop if _merge == 1
  drop _merge

  save $data/firm_`year'.dta, replace

  
  


end //end clean_census program

*clean_census, year(1997) naicscode(fk_naics02)
*clean_census, year(2002) naicscode(fk_naics02)
clean_census, year(2007) naicscode(fk_naics02)
*clean_census, year(2012) naicscode(fk_naics07)


